"""
#
# Flow stability for dynamic community detection https://arxiv.org/abs/2101.06131v2
#
# Copyright (C) 2021 Alexandre Bovet <alexandre.bovet@maths.ox.ac.uk>
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


This script computes the mapping between author IDs and their affiliations' countries.

Results are saved as `../data/aps/df_author_country.csv`.

"""

import pandas as pd
import numpy as np
# NOTE(review): `nproc` is not referenced anywhere else in the visible script —
# presumably a leftover from a parallelised variant; confirm before removing.
nproc = 6
from collections import Counter
from tqdm import tqdm

# Register tqdm's pandas integration; this is what makes the `progress_apply`
# method used on the groupby further down available.
tqdm.pandas()

# NOTE(review): this unconditional raise stops execution here whenever the file
# is run as a whole. Presumably a deliberate guard so that the `#%%` cells below
# are executed interactively, one at a time — confirm before removing.
raise Exception
#%% 

# Edge list read from CSV; the first CSV column is used as the index.
df_edges = pd.read_csv(
    '../paper_data/aps/all_journals_disamb_edges_countries.csv',
    index_col=0,
)

#%%

# Stack both endpoints of every edge into one long table: all `n1` entries
# first, followed by all `n2` entries, each paired with its country column.
df_authors_countries = pd.DataFrame(data={
    target: df_edges[first].tolist() + df_edges[second].tolist()
    for target, first, second in (
        ('country', 'n1_country', 'n2_country'),
        ('author', 'n1', 'n2'),
    )
})

#%%

def find_country(df):
    """
    Summarise the affiliation countries recorded for a single author.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to one author. Must provide a `country` column
        (comma-separated country string, or a missing value) and an
        `author` column.

    Returns
    -------
    pandas.DataFrame
        A single row, indexed by the author ID found in the first row of
        `df`, with the columns:

        - `top_country`: the most frequent country after splitting the
          `country` entries on commas; '' when no country is known.
        - `last_country`: the last non-null `country` entry, kept as is
          (not split on commas); None when every entry is missing.
        - `country_counts`: 'country:count' pairs joined by commas, most
          frequent first; '' when no country is known.

    Raises
    ------
    Exception
        Any error raised while processing is re-raised unchanged, after
        printing the offending group to help debugging.
    """
    try:
        # One entry per individual country; missing rows stay as NaN.
        countries = df.country.str.split(',').explode()

        # Rank once by frequency. Missing values are not strings, so the
        # isinstance filter removes them from the ranking.
        ranked = [(name, count)
                  for name, count in Counter(countries.tolist()).most_common()
                  if isinstance(name, str)]

        country_counts = ','.join(f'{name}:{count}' for name, count in ranked)

        # Read the top country straight from the ranking instead of parsing
        # it back out of the joined summary string.
        top_country = ranked[0][0] if ranked else ''

        known = df.country.dropna()
        last_country = known.iloc[-1] if len(known) > 0 else None

        return pd.DataFrame(data={'top_country' : top_country,
                                  'last_country' : last_country,
                                  'country_counts' : country_counts},
                            index=[df.author.iloc[0]])

    except Exception:
        # Show the group that triggered the failure, then propagate the
        # original exception with its traceback intact.
        print(df)
        raise

# Build one summary row per author by running `find_country` on each
# author's rows, with a tqdm progress bar. The `.iloc[:]` slice selects all
# rows; narrow it to try the pipeline on a subset.
df_auth_country = (
    df_authors_countries.iloc[:]
    .groupby('author')
    .progress_apply(find_country)
)

#%%
# The result carries a two-level index (group key, then the one-row index set
# inside `find_country`); keep only the outer level.
df_auth_country.index = df_auth_country.index.droplevel(1)

df_auth_country.to_csv('../data/aps/df_author_country.csv')
